#column_classes = c("numeric","factor","factor","factor","factor","Date","numeric","numeric","numeric","numeric","numeric","numeric")

# Read and clean one year's ranked holiday album list.
#
# Arguments:
#   holiday_year  Chart year; selects "<holiday_year>-Table 1.csv" and is stored
#                 in the holiday.year column.
#   keep_cols     Optional column index vector applied right after reading
#                 (used to drop an extra column); NULL keeps every column.
#
# Returns the data frame with:
#   * Rank converted to numeric and Title/Artist/Label/Core.Genre to factors,
#   * Release.Date parsed from "%m/%d/%Y" text into a Date,
#   * the six "...YTD" count columns stripped of commas and made numeric,
#   * holiday.year and release.year appended (in that order).
load_album_list = function(holiday_year, keep_cols = NULL) {
  album_dir = "/Users/tylerchiu/Downloads/mrc data project/Annual Holiday Albums - Ranked List 2015-2018"
  ytd_cols = c(
    "Albums.w.TEA.w.SEA.On.Demand.Audio...YTD",
    "Albums.Sales...YTD",
    "Physical.Albums.Sales...YTD",
    "Digital.Albums.Sales...YTD",
    "Digital.Song.Sales...YTD",
    "Streaming.On.Demand.Audio...YTD"
  )

  albums = read.csv(file.path(album_dir, paste0(holiday_year, "-Table 1.csv")))
  if (!is.null(keep_cols)) {
    albums = albums[, keep_cols]
  }

  #convert columns to correct data types
  albums = albums %>%
    convert(num(Rank), fct(Title, Artist, Label, Core.Genre))
  albums = albums %>%
    mutate(Release.Date = as.Date(Release.Date, format = "%m/%d/%Y"))

  #use gsub to get rid of commas in numeric values
  albums[ytd_cols] = lapply(albums[ytd_cols], function(x) as.numeric(gsub(",", "", x)))

  albums$holiday.year = rep(holiday_year, nrow(albums))
  albums %>% mutate(release.year = year(Release.Date))
}

album_list_2015 = load_album_list(2015, keep_cols = c(1:12)) #extra column was deleted
album_list_2016 = load_album_list(2016, keep_cols = c(1:12)) #extra column was deleted
album_list_2017 = load_album_list(2017)
# Knit output recorded when Rank of the 2018 list is converted:
## Warning in as_reliable_num(.): NAs introduced by coercion
# NOTE(review): the warning means some 2018 Rank values are not numeric and
# become NA - check the 2018 file for stray non-data rows.
album_list_2018 = load_album_list(2018)

# Stack the four yearly lists into one data frame.
album_list_df = as.data.frame(rbind(album_list_2015, album_list_2016, album_list_2017, album_list_2018))
# Read and clean one year's daily trend file for either segment.
#
# Arguments:
#   segment    "holiday" or "industry"; selects "daily <segment> <year>.csv"
#              and is stored in the industry column as the row label.
#   data_year  Year that selects the file.
#
# Returns the data frame with:
#   * Date parsed from "%m/%d/%Y" text into a Date,
#   * the fifteen count columns stripped of commas and made numeric,
#   * day and year (both derived from Date) and industry appended, in that order.
load_daily_trends = function(segment, data_year) {
  trends_dir = "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends"
  count_cols = c(
    "Albums.w.TEA.w.SEA.On.Demand",
    "Albums.w.TEA.w.SEA.On.Demand.Audio",
    "Albums.w..TEA",
    "Total.Album.Sales",
    "Physical.Albums.Sales",
    "Digital.Albums.Sales",
    "Digital.Song.Sales",
    "Total.Streaming.On.Demand",
    "Streaming.On.Demand.Audio",
    "Streaming.On.Demand.Video",
    "Total.Streaming.Programmed",
    "Streaming.Programmed.Audio",
    "Streaming.Programmed.Video",
    "Airplay.Spins",
    "Airplay.Audience"
  )

  daily = read.csv(file.path(trends_dir, paste0("daily ", segment, " ", data_year, ".csv")))
  daily = daily %>%
    mutate(Date = as.Date(Date, format = "%m/%d/%Y"))

  # Counts are exported with thousands separators; drop the commas first.
  daily[count_cols] = lapply(daily[count_cols], function(x) as.numeric(gsub(",", "", x)))

  daily = daily %>%
    mutate(day = day(Date), year = year(Date))
  # Label column: "holiday" for holiday-music rows, "industry" for industry rows.
  daily$industry = rep(segment, nrow(daily))
  daily
}

daily_holiday_2016 = load_daily_trends("holiday", 2016)
daily_holiday_2017 = load_daily_trends("holiday", 2017)
daily_holiday_2018 = load_daily_trends("holiday", 2018)
daily_holiday_2019 = load_daily_trends("holiday", 2019)

daily_industry_2016 = load_daily_trends("industry", 2016)
daily_industry_2017 = load_daily_trends("industry", 2017)
daily_industry_2018 = load_daily_trends("industry", 2018)
daily_industry_2019 = load_daily_trends("industry", 2019)

# Stack holiday rows first, then industry rows, each in year order.
daily_holiday_industry = as.data.frame(rbind(
  daily_holiday_2016, daily_holiday_2017, daily_holiday_2018, daily_holiday_2019,
  daily_industry_2016, daily_industry_2017, daily_industry_2018, daily_industry_2019
))
# Read and clean one year's weekly trend file for either segment.
#
# Arguments:
#   segment    "holiday" or "industry"; selects "weekly <segment> <year>.csv"
#              and is stored in the industry column as the row label.
#   data_year  Year that selects the file; stored in the year column.
#
# Returns the data frame with:
#   * the fifteen count columns stripped of commas and made numeric,
#   * industry, week (row position 1..n within the file) and year appended,
#     in that order.
load_weekly_trends = function(segment, data_year) {
  trends_dir = "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends"
  count_cols = c(
    "Albums.w.TEA.w.SEA.On.Demand",
    "Albums.w.TEA.w.SEA.On.Demand.Audio",
    "Albums.w..TEA",
    "Total.Album.Sales",
    "Physical.Albums.Sales",
    "Digital.Albums.Sales",
    "Digital.Song.Sales",
    "Total.Streaming.On.Demand",
    "Streaming.On.Demand.Audio",
    "Streaming.On.Demand.Video",
    "Total.Streaming.Programmed",
    "Streaming.Programmed.Audio",
    "Streaming.Programmed.Video",
    "Airplay.Spins",
    "Airplay.Audience"
  )

  weekly = read.csv(file.path(trends_dir, paste0("weekly ", segment, " ", data_year, ".csv")))

  # Counts are exported with thousands separators; drop the commas first.
  weekly[count_cols] = lapply(weekly[count_cols], function(x) as.numeric(gsub(",", "", x)))

  n_weeks = nrow(weekly)
  # Label column: "holiday" for holiday-music rows, "industry" for industry rows.
  weekly$industry = rep(segment, n_weeks)
  # seq_len() rather than 1:n so an empty file yields zero rows, not c(1, 0).
  weekly$week = seq_len(n_weeks)
  weekly$year = rep(data_year, n_weeks)
  weekly
}

weekly_holiday_2015 = load_weekly_trends("holiday", 2015)
weekly_holiday_2016 = load_weekly_trends("holiday", 2016)
weekly_holiday_2017 = load_weekly_trends("holiday", 2017)
weekly_holiday_2018 = load_weekly_trends("holiday", 2018)
weekly_holiday_2019 = load_weekly_trends("holiday", 2019)

weekly_industry_2015 = load_weekly_trends("industry", 2015)
weekly_industry_2016 = load_weekly_trends("industry", 2016)
weekly_industry_2017 = load_weekly_trends("industry", 2017)
weekly_industry_2018 = load_weekly_trends("industry", 2018)
weekly_industry_2019 = load_weekly_trends("industry", 2019)

# Stack holiday rows first, then industry rows, each in year order.
weekly_holiday_industry = as.data.frame(rbind(
  weekly_holiday_2015, weekly_holiday_2016, weekly_holiday_2017, weekly_holiday_2018, weekly_holiday_2019,
  weekly_industry_2015, weekly_industry_2016, weekly_industry_2017, weekly_industry_2018, weekly_industry_2019
))
# Holiday/seasonal on-demand audio streaming charts, one CSV per year.
# For each year: read the file, then turn YTD.Audio (exported with thousands
# separators) into a numeric column.
song_list_2016 = read.csv("/Users/tylerchiu/Downloads/mrc data project/HolidaySeasonal On-Demand Audio Streaming (YTD) Charts/2016-Table 1.csv")
song_list_2016$YTD.Audio = as.numeric(gsub(",", "", song_list_2016$YTD.Audio))

song_list_2017 = read.csv("/Users/tylerchiu/Downloads/mrc data project/HolidaySeasonal On-Demand Audio Streaming (YTD) Charts/2017-Table 1.csv")
song_list_2017$YTD.Audio = as.numeric(gsub(",", "", song_list_2017$YTD.Audio))

song_list_2018 = read.csv("/Users/tylerchiu/Downloads/mrc data project/HolidaySeasonal On-Demand Audio Streaming (YTD) Charts/2018-Table 1.csv")
song_list_2018$YTD.Audio = as.numeric(gsub(",", "", song_list_2018$YTD.Audio))

song_list_2019 = read.csv("/Users/tylerchiu/Downloads/mrc data project/HolidaySeasonal On-Demand Audio Streaming (YTD) Charts/2019-Table 1.csv")
song_list_2019$YTD.Audio = as.numeric(gsub(",", "", song_list_2019$YTD.Audio))

# Stack the four yearly charts in chronological order.
yearly_song_lists = list(song_list_2016, song_list_2017, song_list_2018, song_list_2019)
song_list_df = as.data.frame(do.call(rbind, yearly_song_lists))
# Per-column summary of the cleaned 2015 album list; the "##" lines that follow
# are the recorded console output of this call.
summary(album_list_2015)
## Rank Title Artist
## Min. : 1.00 Classic Christmas Album: 10 Bing Crosby : 13
## 1st Qu.: 50.75 Soundtrack : 7 Frank Sinatra : 13
## Median :100.50 Christmas Collection : 6 Elvis Presley : 11
## Mean :100.50 Christmas Album : 4 Mannheim Steamroller : 8
## 3rd Qu.:150.25 Home For Christmas : 4 Trans-Siberian Orchestra: 7
## Max. :200.00 Christmas : 3 Andy Williams : 5
## (Other) :166 (Other) :143
## Label Core.Genre Release.Date
## RCA : 25 Children : 1 Min. :1977-09-09
## COL : 20 Christian/Gospel: 1 1st Qu.:1998-07-12
## CAP : 16 Country : 2 Median :2006-09-27
## : 10 Holiday/Seasonal:194 Mean :2004-09-11
## WAR : 10 Pop : 1 3rd Qu.:2012-10-05
## INT : 9 Rock : 1 Max. :2016-11-25
## (Other):110
## Albums.w.TEA.w.SEA.On.Demand.Audio...YTD Albums.Sales...YTD
## Min. : 14035 Min. : 1
## 1st Qu.: 17634 1st Qu.: 4480
## Median : 25308 Median : 15296
## Mean : 32139 Mean : 19461
## 3rd Qu.: 39546 3rd Qu.: 27370
## Max. :162621 Max. :154952
## NA's :16
## Physical.Albums.Sales...YTD Digital.Albums.Sales...YTD
## Min. : 1 Min. : 1.0
## 1st Qu.: 3173 1st Qu.: 662.5
## Median : 13029 Median : 2382.0
## Mean : 16757 Mean : 4293.6
## 3rd Qu.: 25406 3rd Qu.: 5358.5
## Max. :120095 Max. :34857.0
## NA's :25 NA's :49
## Digital.Song.Sales...YTD Streaming.On.Demand.Audio...YTD holiday.year
## Min. : 970 Min. : 50139 Min. :2015
## 1st Qu.: 16970 1st Qu.: 3158242 1st Qu.:2015
## Median : 47930 Median : 8576470 Median :2015
## Mean : 65661 Mean :12136070 Mean :2015
## 3rd Qu.: 82338 3rd Qu.:18731002 3rd Qu.:2015
## Max. :547235 Max. :75282837 Max. :2015
## NA's :3 NA's :8
## release.year
## Min. :1977
## 1st Qu.:1998
## Median :2006
## Mean :2004
## 3rd Qu.:2012
## Max. :2016
##
# Quick exploration of the 2015 album list. The per-artist summaries are left
# commented out.
# summary(album_list_2015$Artist)
# Distribution of total album-equivalent units (sales + TEA + on-demand audio SEA), year to date.
summary(album_list_2015$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14035 17634 25308 32139 39546 162621
# summary(album_list_2016$Artist)
# summary(album_list_2017$Artist)
# summary(album_list_2018$Artist)
# Artist and Core.Genre were converted to factors above, so each formula plot
# below compares a numeric measure across the levels of a factor.
# Album-equivalent units by artist.
plot(album_list_2015$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD~album_list_2015$Artist)
# Album-equivalent units by core genre.
plot(album_list_2015$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD ~ album_list_2015$Core.Genre)
# On-demand audio streams by core genre.
plot(album_list_2015$Streaming.On.Demand.Audio...YTD ~ album_list_2015$Core.Genre)
# Chart 1: daily album-equivalent consumption, holiday vs. industry, one panel per year.
# NOTE(review): `day` is the day of the month; if a year's daily file spans more
# than one month, bars from different months share an x position - confirm.
g = ggplot(data = daily_holiday_industry,
           aes(x = day, y = Albums.w.TEA.w.SEA.On.Demand, fill = as.factor(industry))) +
  geom_col(position = 'dodge') +
  facet_wrap(vars(year)) +
  labs(fill = "Genre") +
  ylab("Total Album Equivalent Consumption") +
  ggtitle("Total Equivalent Consumption from 2016-2019 between Holiday and Other Genres")
ggplotly(g)
# shows the disproportion between holiday and other music available at the time
# reasonable to say that there is more traffic towards non-holiday music year round

# Chart 2: weekly album-equivalent consumption, one panel per year.
# The weekly data starts in 2015, so the title covers 2015-2019.
g = ggplot(data = weekly_holiday_industry,
           aes(x = as.factor(week), y = Albums.w.TEA.w.SEA.On.Demand, fill = as.factor(industry))) +
  geom_col(position = 'dodge') +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Total Album Equivalent Consumption") +
  ggtitle("Yearly Total Equivalent Consumption from 2015-2019") +
  labs(fill = "Genre")
# + ggtitle("daily industry vs holiday comparison from 2016-2019")
ggplotly(g)
# shows the disproportion between holiday and other music (marked as industry) available at the time

# Chart 3: weekly total album sales, one panel per year; axis and legend labels
# match the two charts above.
g = ggplot(data = weekly_holiday_industry,
           aes(x = as.factor(week), y = Total.Album.Sales, fill = as.factor(industry))) +
  geom_col(position = 'dodge') +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Total Album Sales") +
  ggtitle("Total Album Sales from 2015-2019 between Holiday and Other Genres in the Industry") +
  labs(fill = "Genre")
ggplotly(g)
# Distinct (Artist, Song) pairs, ordered by chart rank and then by weeks on
# chart (longest first). The frame stays grouped by Artist and Song, so
# distinct() keeps both columns. Titles that differ only in capitalisation or
# wording are counted as separate songs.
song_list_df %>%
  group_by(Artist, Song) %>%
  arrange(Rank, desc(Weeks.On.Chart)) %>%
  distinct(Artist)
## # A tibble: 367 x 2
## # Groups: Artist, Song [367]
## Artist Song
## <fct> <fct>
## 1 Mariah Carey All I Want For Christmas Is You
## 2 Brenda Lee Rockin' Around the Christmas Tree
## 3 Pentatonix Hallelujah
## 4 Bobby Helms Jingle Bell Rock
## 5 Andy Williams It's The Most Wonderful Time Of The Year
## 6 Burl Ives Have A Holly Jolly Christmas
## 7 Brenda Lee Rockin' Around The Christmas Tree
## 8 Burl Ives A Holly Jolly Christmas
## 9 Michael Buble It's Beginning To Look A Lot Like Christmas
## 10 Wham! Last Christmas
## # … with 357 more rows
Album equivalent unit: 1 album sale = 10 songs downloaded = 1,500 streams. TEA = track equivalent album; SEA = streaming equivalent album. On-demand streaming covers services such as Amazon Music, Apple Music, Spotify and YouTube; programmed streaming covers services such as Pandora and Slacker Radio.
Older artists show up consistently throughout 2015-2018, and several of them have multiple works that are listened to during the holidays.
Holiday music released within the past 15 years has been listened to the most.
# Stack the four yearly album lists into a single data frame.
album_list_df <- as.data.frame(
  rbind(
    album_list_2015,
    album_list_2016,
    album_list_2017,
    album_list_2018
  )
)
summary(album_list_df$release.year)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1954 1996 2004 2003 2012 2018 1
# The middle half of the release years (1996-2012) spans roughly 15 years, so
# the holiday music still being listened to covers more than a decade and a half.
# Release year by row position, one plot per holiday season.
plot(album_list_2015$release.year)
plot(album_list_2016$release.year)
plot(album_list_2017$release.year)
plot(album_list_2018$release.year)
# Across that 15-year span the music being consumed looks fairly uniform.
plot(
  album_list_df$release.year,
  ylab = "Release Years",
  xlab = "Albums from 2015-2018",
  main = "Distribution of Music Consumed during 2015-2018"
)
#%>% top_n(10,album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
# Distinct (Artist, Title) pairs ordered by album-equivalent consumption,
# highest first. The sort column is referenced through dplyr's data mask
# rather than as album_list_df$...: the `$` form only lines up with the rows
# by accident and would silently misorder them if a filter or per-group
# arrange were ever added upstream.
album_list_df %>%
  group_by(Artist, Title) %>%
  arrange(desc(Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)) %>%
  distinct(Artist)
## # A tibble: 329 x 2
## # Groups: Artist, Title [329]
## Artist Title
## <fct> <fct>
## 1 Pentatonix Pentatonix Christmas
## 2 Merry Xmas Merry Xmas
## 3 Christmas Hits Christmas Hits
## 4 Michael Buble Christmas
## 5 Magic Christmas Magic Christmas
## 6 Pentatonix Christmas Is Here!
## 7 Garth Brooks Christmas Together
## 8 Frank Sinatra Holidays & Hits
## 9 Nat King Cole Holidays & Hits
## 10 Coolest Yule Ever Coolest Yule Ever
## # … with 319 more rows
# Over the four years, these artists have the highest album sales per unit of
# total album-equivalent consumption. mean() is called without na.rm, so an
# artist with a missing ratio in any row gets NA.
album_list_df %>%
  group_by(Artist) %>%
  summarise(
    avg = mean(Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 179 x 2
## Artist avg
## <fct> <dbl>
## 1 Chipmunks 1
## 2 Countdown Kids 1
## 3 Various Artists 1
## 4 George Strait 1
## 5 Joey + Rory 0.994
## 6 Irish Tenors 0.992
## 7 Randy Travis 0.987
## 8 Neil Diamond 0.984
## 9 Mot Dominican Sisters Of Mary 0.983
## 10 Loretta Lynn 0.977
## # … with 169 more rows
# Top artists by type of sale/stream. This one: physical album sales per unit
# of album-equivalent consumption, averaged per artist.
album_list_df %>%
  group_by(Artist) %>%
  summarise(
    avg = mean(Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 179 x 2
## Artist avg
## <fct> <dbl>
## 1 Chipmunks 1
## 2 Countdown Kids 1
## 3 Various Artists 1
## 4 George Strait 1
## 5 Irish Tenors 0.987
## 6 Joey + Rory 0.982
## 7 Randy Travis 0.969
## 8 Gaither Vocal Band 0.956
## 9 Chicago 0.943
## 10 Loretta Lynn 0.934
## # … with 169 more rows
# Digital album sales per unit of album-equivalent consumption, averaged per
# artist and sorted from highest to lowest.
album_list_df %>%
  group_by(Artist) %>%
  summarise(
    avg = mean(Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 179 x 2
## Artist avg
## <fct> <dbl>
## 1 Braxtons 0.510
## 2 Ariana Grande 0.466
## 3 George Winston 0.463
## 4 Taylor Swift 0.457
## 5 Leslie Odom Jr 0.432
## 6 Peter Hollens 0.382
## 7 Hanson 0.339
## 8 Seth MacFarlane 0.337
## 9 Kacey Musgraves 0.308
## 10 She & Him 0.281
## # … with 169 more rows
# Digital song sales per unit of album-equivalent consumption, averaged per
# artist. This is a count of songs per unit, so values above 1 are expected.
album_list_df %>%
  group_by(Artist) %>%
  summarise(
    avg = mean(Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 179 x 2
## Artist avg
## <fct> <dbl>
## 1 Quink Vocal Ensemble 6.88
## 2 Twisted Sister 6.38
## 3 Ella Fitzgerald & Louis Armstrong 4.60
## 4 Willie Nelson 4.45
## 5 Fred Claus 4.03
## 6 Happy Holidays 3.84
## 7 Four Tops 3.83
## 8 Drifters 3.75
## 9 Ferrante & Teicher 3.08
## 10 Johnny Cash 2.89
## # … with 169 more rows
# On-demand audio streams per unit of album-equivalent consumption, averaged
# per artist. The result is a stream count per unit, not a 0-1 share.
album_list_df %>%
  group_by(Artist) %>%
  summarise(
    avg = mean(Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 179 x 2
## Artist avg
## <fct> <dbl>
## 1 Crosby, King 1388.
## 2 Christmas Stars 1359.
## 3 Chestnut, Cyrus & Friends 1356.
## 4 Spirit Of Christmas 1349.
## 5 Smcmg 2002 1334.
## 6 Tribute To Green Day 1318.
## 7 Silver Bells-Christmas Class 1306.
## 8 Nat King Cole,Bing Crosby,De 1305.
## 9 Holiday Favorites Series 1293.
## 10 Erran Baron Cohen 1292.
## # … with 169 more rows
# Flag albums released before 2004: 1 = before 2004, 0 = 2004 or later,
# NA when the release year is missing.
before_2004_index <- as.numeric(album_list_df$release.year < 2004)
album_list_df$before_2004 <- before_2004_index
# Physical album share per artist and release year, highest first.
album_list_df %>%
  group_by(Artist, release.year) %>%
  summarise(
    avg = mean(Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 303 x 3
## # Groups: Artist [179]
## Artist release.year avg
## <fct> <dbl> <dbl>
## 1 Chipmunks 2001 1
## 2 Countdown Kids 2014 1
## 3 Pentatonix 2017 1
## 4 Various Artists 2013 1
## 5 George Strait 2016 1
## 6 Irish Tenors 2009 0.987
## 7 Joey + Rory 2011 0.982
## 8 Randy Travis 2007 0.969
## 9 Trans-Siberian Orchestra 2012 0.960
## 10 Gaither Vocal Band 2015 0.956
## # … with 293 more rows
# Although older musicians from past eras have more works available,
# consumers still listen to specific artists regardless of how many works
# they have published.
# Physical album share per album (Title, Artist, release year), highest first.
album_list_df %>%
  group_by(Title, Artist, release.year) %>%
  summarise(
    avg = mean(Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 329 x 4
## # Groups: Title, Artist [329]
## Title Artist release.year avg
## <fct> <fct> <dbl> <dbl>
## 1 Christmas Sing-along Various Artists 2013 1
## 2 Santa Claus Music Puzzle Countdown Kids 2014 1
## 3 Vol. 1-christmas With The Chip Chipmunks 2001 1
## 4 Strait For The Holidays George Strait 2016 1
## 5 That's Xmas To Me + Ptxmas Del Pentatonix 2017 1
## 6 Irish Tenors Christmas Irish Tenors 2009 0.987
## 7 A Farmhouse Christmas Joey + Rory 2011 0.982
## 8 Songs Of The Season Randy Travis 2007 0.969
## 9 Dreams Of Fireflies (on A Chri Trans-Siberian Orchestra 2012 0.960
## 10 Christmas Collection Gaither Vocal Band 2015 0.956
## # … with 319 more rows
# Digital album share per album (Title, Artist, release year), highest first.
album_list_df %>%
  group_by(Title, Artist, release.year) %>%
  summarise(
    avg = mean(Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 329 x 4
## # Groups: Title, Artist [329]
## Title Artist release.year avg
## <fct> <fct> <dbl> <dbl>
## 1 Christmas & Chill Ariana Grande 2015 0.529
## 2 Braxton Family Christmas Braxtons 2015 0.510
## 3 December George Winston 1983 0.463
## 4 The Holiday Collection Taylor Swift 2007 0.457
## 5 Simply Christmas Leslie Odom Jr 2016 0.432
## 6 Merry Christmas Johnny Mathis 1977 0.392
## 7 Home For Christmas Amy Grant 1992 0.390
## 8 Hollens Family Christmas Peter Hollens 2016 0.382
## 9 Wintersong Sarah McLachlan 2006 0.344
## 10 Christmas Kisses Ariana Grande 2013 0.340
## # … with 319 more rows
# Older artists are streamed more often.
# On-demand streams per unit of album-equivalent consumption, per album.
album_list_df %>%
  group_by(Title, Artist, release.year) %>%
  summarise(
    avg = mean(Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 329 x 4
## # Groups: Title, Artist [329]
## Title Artist release.year avg
## <fct> <fct> <dbl> <dbl>
## 1 Winter Wonderland Crosby, King 2004 1388.
## 2 I Wish You A Merry Christmas ( Bing Crosby 2011 1368.
## 3 Merry Christmas From Bing Cros Bing Crosby 2009 1368.
## 4 10 Great Christmas Songs Bing Crosby 2012 1366.
## 5 I Wish You A Merry Christmas Bing Crosby 2001 1364.
## 6 Christmas Stars Christmas Stars 1991 1359.
## 7 Charlie Brown Christmas Chestnut, Cyrus & Friends 2000 1356.
## 8 Bing Crosby's Christmas Classi Bing Crosby 1999 1354.
## 9 Spirit Of Christmas Spirit Of Christmas 1993 1349.
## 10 Complete Rca Christmas Col Perry Como 2013 1348.
## # … with 319 more rows
# Physical album sales are spread evenly across release years, but there is
# an overall decrease in physical album sales.
# Aside from the Chipmunks album (pre-2004), the highest physical album shares
# come from albums released after 2004.
# Artists are ordered by descending physical share; fill marks the pre-2004
# flag. The y-axis label typo ("Proporition") is corrected.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  ylab("Physical Album Sale Proportion") +
  xlab("Artists") +
  labs(fill = "< 2004") +
  ggtitle("Distribution of Physical Album Sales between Old and New Holiday Music")
ggplotly(g)
# Albums produced after 2004 are purchased digitally more often, but digital
# ownership is steadily declining.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  xlab("Artists") +
  labs(fill = "< 2004") +
  ylab("Digital Album Sale Proportion") +
  ggtitle("Distribution of Digital Album Sales between Old and New Holiday Music")
ggplotly(g)
# Individual digital song sales are spread evenly between music from before
# and after 2004.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  xlab("Artists") +
  labs(fill = "< 2004") +
  ylab("Digital Song Sale Proportion") +
  ggtitle("Distribution of Digital Song Sales between Old and New Holiday Music")
ggplotly(g)
# On-demand streaming is spread evenly between music from before and after 2004.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  xlab("Artists") +
  labs(fill = "< 2004") +
  ylab("On Demand Streaming Proportion") +
  ggtitle("Distribution of On Demand Streaming between Old and New Holiday Music")
ggplotly(g)
# Column means of the six metric columns (positions 7-12) for albums released
# before 2004 and for the rest. The "after" group is before_2004 == 0, i.e.
# release years 2004 and later. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
before_2004_means <- colMeans(
  album_list_df[album_list_df$before_2004 == 1, c(7:12)],
  na.rm = TRUE
)
after_2004_means <- colMeans(
  album_list_df[album_list_df$before_2004 == 0, c(7:12)],
  na.rm = TRUE
)
# Symmetric percent difference between the two groups: the gap is divided by
# the average of both means, so neither group is treated as the baseline.
perc_difference_2004 <- (after_2004_means - before_2004_means) /
  ((before_2004_means + after_2004_means) / 2) * 100 # percent difference
perc_difference_2004
## Albums.w.TEA.w.SEA.On.Demand.Audio...YTD
## 3.371509
## Albums.Sales...YTD
## 74.995332
## Physical.Albums.Sales...YTD
## 80.135553
## Digital.Albums.Sales...YTD
## 72.918539
## Digital.Song.Sales...YTD
## -4.755121
## Streaming.On.Demand.Audio...YTD
## -18.861186
# Percent change from the pre-2004 mean to the 2004-and-later mean, using the
# pre-2004 mean as the baseline.
perc_change_2004 <-
  (after_2004_means - before_2004_means) / before_2004_means * 100 # percent change
perc_change_2004
## Albums.w.TEA.w.SEA.On.Demand.Audio...YTD
## 3.429319
## Albums.Sales...YTD
## 119.988050
## Physical.Albums.Sales...YTD
## 133.710294
## Digital.Albums.Sales...YTD
## 114.758736
## Digital.Song.Sales...YTD
## -4.644691
## Streaming.On.Demand.Audio...YTD
## -17.235753
# One-sample t-test: is the mean album-equivalent consumption of pre-2004
# albums different from the mean of the 2004-and-later albums?
# NOTE(review): mu is itself a sample estimate that is treated as a fixed
# constant here; a two-sample test would account for its variability.
t.test(
  album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD[which(album_list_df$before_2004==1)],
  mu = mean(
    album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD[which(album_list_df$before_2004==0)],
    na.rm = TRUE
  )
)
##
## One Sample t-test
##
## data: album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD[which(album_list_df$before_2004 == 1)]
## t = -0.87817, df = 377, p-value = 0.3804
## alternative hypothesis: true mean is not equal to 62072.09
## 95 percent confidence interval:
## 55405.86 64622.17
## sample estimates:
## mean of x
## 60014.02
# One-sample t-test of pre-2004 digital album sales against the mean digital
# album sales of 2004-and-later albums (that mean is used as a fixed mu).
t.test(
  album_list_df$Digital.Albums.Sales...YTD[which(album_list_df$before_2004==1)],
  mu = mean(
    album_list_df$Digital.Albums.Sales...YTD[which(album_list_df$before_2004==0)],
    na.rm = TRUE
  )
)
##
## One Sample t-test
##
## data: album_list_df$Digital.Albums.Sales...YTD[which(album_list_df$before_2004 == 1)]
## t = -11.432, df = 242, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 5962.03
## 95 percent confidence interval:
## 2227.208 3325.097
## sample estimates:
## mean of x
## 2776.152
# One-sample t-test of pre-2004 on-demand audio streams against the mean
# streams of 2004-and-later albums (that mean is used as a fixed mu).
t.test(
  album_list_df$Streaming.On.Demand.Audio...YTD[which(album_list_df$before_2004==1)],
  mu = mean(
    album_list_df$Streaming.On.Demand.Audio...YTD[which(album_list_df$before_2004==0)],
    na.rm = TRUE
  )
)
##
## One Sample t-test
##
## data: album_list_df$Streaming.On.Demand.Audio...YTD[which(album_list_df$before_2004 == 1)]
## t = 3.411, df = 374, p-value = 0.0007178
## alternative hypothesis: true mean is not equal to 50735929
## 95 percent confidence interval:
## 55210963 67392530
## sample estimates:
## mean of x
## 61301747
# Streaming result above: older music is streamed more.
# One-sample t-test of pre-2004 physical album sales against the mean
# physical album sales of 2004-and-later albums (used as a fixed mu).
t.test(
  album_list_df$Physical.Albums.Sales...YTD[which(album_list_df$before_2004==1)],
  mu = mean(
    album_list_df$Physical.Albums.Sales...YTD[which(album_list_df$before_2004==0)],
    na.rm = TRUE
  )
)
##
## One Sample t-test
##
## data: album_list_df$Physical.Albums.Sales...YTD[which(album_list_df$before_2004 == 1)]
## t = -16.54, df = 339, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 23064.81
## 95 percent confidence interval:
## 8299.671 11438.276
## sample estimates:
## mean of x
## 9868.974
# One-sample t-test of pre-2004 digital song sales against the mean digital
# song sales of 2004-and-later albums (that mean is used as a fixed mu).
t.test(
  album_list_df$Digital.Song.Sales...YTD[which(album_list_df$before_2004==1)],
  mu = mean(
    album_list_df$Digital.Song.Sales...YTD[which(album_list_df$before_2004==0)],
    na.rm = TRUE
  )
)
##
## One Sample t-test
##
## data: album_list_df$Digital.Song.Sales...YTD[which(album_list_df$before_2004 == 1)]
## t = 1.0084, df = 375, p-value = 0.3139
## alternative hypothesis: true mean is not equal to 52597.42
## 95 percent confidence interval:
## 50163.78 60155.03
## sample estimates:
## mean of x
## 55159.4
# Physical album share per artist, one facet per holiday season; fill marks
# albums released before 2004. The y-axis label typo ("Proporition") is
# corrected.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(holiday.year)) +
  xlab("Artists") +
  ylab("Physical Album Sale Proportion") +
  labs(fill = "< 2004") +
  ggtitle("Distribution of Physical Album Sales between Old and New Holiday Music from 2015-2018")
ggplotly(g)
# Digital album share per artist, one facet per holiday season.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(holiday.year)) +
  xlab("Artists") +
  ylab("Digital Album Sale Proportion") +
  labs(fill = "< 2004") +
  ggtitle("Distribution of Digital Album Sales between Old and New Holiday Music from 2015-2018")
ggplotly(g)
# Digital song sales per album-equivalent unit, one facet per holiday season.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(holiday.year)) +
  xlab("Artists") +
  ylab("Digital Song Sale Proportion") +
  labs(fill = "< 2004") +
  ggtitle("Distribution of Digital Song Sales between Old and New Holiday Music from 2015-2018")
ggplotly(g)
# Albums from before 2004 are streamed slightly more, but streaming in general
# has increased over the past few years.
g <- ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(holiday.year)) +
  labs(fill = "< 2004") +
  xlab("Artists") +
  ylab("On Demand Streaming Proportion") +
  ggtitle("Distribution of On Demand Streaming between Old and New Holiday Music from 2015-2018")
ggplotly(g)
# Each metric is divided by Albums.w.TEA.w.SEA.On.Demand to normalise the
# amounts so the two groups can be compared directly.
# This shows that more holiday albums are sold, in proportion to the amount of
# holiday albums existing, than albums of other genres.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Total.Album.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Total Album Sale Proportion") +
  ggtitle("Proportion of Total Album Sales between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Other genres have a higher rate of streaming during the holidays and a
# slight increase in streaming volume.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Total.Streaming.On.Demand / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("On Demand Streaming Proportion") +
  ggtitle("Proportion of On Demand Streaming Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Other genres have a higher rate of streaming during the holidays.
# Same chart as above without the per-year facets.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Total.Streaming.On.Demand / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  ylab("On Demand Streaming Proportion") +
  ggtitle("Proportion of On Demand Streaming Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Holiday music is physically purchased more often than other genres, but both
# have been decreasing over the past 4 years.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Physical.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Physical Album Sales Proportion") +
  ggtitle("Proportion of Physical Album Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# On Christmas Day, digital albums from other genres are purchased more;
# otherwise digital holiday albums are purchased more.
## Anomaly: why would other music be purchased more on Christmas Day? Maybe
## because people have more disposable income on that day.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Digital.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Digital Album Sales Proportion") +
  ggtitle("Proportion of Digital Album Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Individual song sales are more prevalent in other genres, but both are
# decreasing.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Digital.Song.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Digital Song Sale Proportion") +
  ggtitle("Proportion of Digital Song Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Programmed streaming has increased over the past 4 years. Holiday music was
# streamed more frequently than other genres during the holiday season when
# controlled by the proportion of albums across different genres.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Total.Streaming.Programmed / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Total Programmed Streaming Proportion") +
  ggtitle("Proportion of Total Programmed Streaming Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# When holiday music is played on the radio, it is listened to more than other
# genres. The y value is audience per spin.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Airplay.Audience / Airplay.Spins,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Airplay Interaction Proportion") +
  ggtitle("Proportion of Airplay Interaction Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# When holiday music is played on the radio, it is listened to more than other
# genres, but there isn't a huge change in airplay volume.
# Same chart as above without the per-year facets.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Airplay.Audience / Airplay.Spins,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  ylab("Airplay Interaction Proportion") +
  ggtitle("Proportion of Airplay Interaction Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Differences in holiday and industry sale/stream volumes during Christmas week.
# For each group: take the mean of metric columns 2-16, express entries 2-13
# as a proportion of entry 1 (total album-equivalent consumption), then append
# airplay interaction (entry 15 / entry 14, audience per spin) as element 13.
#
# The appended element is named with names(x)[13] <- ...; the earlier form
# names(x[13]) <- ... only renamed a temporary copy, so the value printed with
# a blank name. The industry vector is named the same way for consistency.
holiday_means <- colMeans(
  daily_holiday_industry[daily_holiday_industry$industry == "holiday", c(2:16)],
  na.rm = TRUE
)
holiday_means_prop <- holiday_means[c(2:13)] / holiday_means[1] # proportion to total equivalent album consumption
holiday_means_prop[13] <- holiday_means[15] / holiday_means[14] # airplay interaction = audience/spin
names(holiday_means_prop)[13] <- "Airplay.Interaction"
industry_means <- colMeans(
  daily_holiday_industry[daily_holiday_industry$industry == "industry", c(2:16)],
  na.rm = TRUE
)
industry_means_prop <- industry_means[c(2:13)] / industry_means[1] # proportion to total equivalent album consumption
industry_means_prop[13] <- industry_means[15] / industry_means[14] # airplay interaction = audience/spin
names(industry_means_prop)[13] <- "Airplay.Interaction"
# Symmetric percent difference between holiday and industry proportions
# (positive = holiday is higher).
holiday_industry_perc_diff <- ((holiday_means_prop - industry_means_prop) /
  ((industry_means_prop + holiday_means_prop) / 2)) * 100
holiday_industry_perc_diff
## Albums.w.TEA.w.SEA.On.Demand.Audio Albums.w..TEA
## 8.39826 18.45186
## Total.Album.Sales Physical.Albums.Sales
## 27.89852 30.19676
## Digital.Albums.Sales Digital.Song.Sales
## 21.60200 -44.20631
## Total.Streaming.On.Demand Streaming.On.Demand.Audio
## -22.02095 -4.28224
## Streaming.On.Demand.Video Total.Streaming.Programmed
## -63.90795 43.44929
## Streaming.Programmed.Audio Streaming.Programmed.Video
## 43.44944 -45.26248
##
## 11.78556
# When controlled for the different proportions of music published, holiday
# albums overtake the sales of other genres around week 28, plus or minus 3 weeks.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Total.Album.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Total Album Sale Proportion") +
  ggtitle("Proportion of Total Album Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# On average, other genres are streamed more than holiday music. Holiday music
# streaming begins decreasing around week 35, plus or minus 3 weeks, which is
# around late August.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Total.Streaming.On.Demand / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Total On Demand Streaming Proportion") +
  ggtitle("Proportion of On Demand Streaming Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Holiday physical album sales hit a minimum around week 20, which is June,
# and reach their maximum at week 40, which is in November.
# NOTE(review): if `week` is the calendar week of the year, week 20 falls in
# May and week 40 in early October -- confirm how `week` is defined.
# The y-axis label typo ("PHysical") is corrected.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Physical.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Physical Album Sale Proportion") +
  ggtitle("Proportion of Physical Album Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# The peak of holiday digital album sales occurs around week 43, which is mid
# October.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Digital.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Digital Album Sale Proportion") +
  ggtitle("Proportion of Digital Album Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# There is an overall decline in individual digital song sales across the
# years. Holiday digital song sales follow the on-demand streaming patterns.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Digital.Song.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Digital Song Sale Proportion") +
  ggtitle("Proportion of Digital Song Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Programmed streaming has increased over the years, with similar
# distributions of music streamed across different genres.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Total.Streaming.Programmed / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Total Programmed Streaming Proportion") +
  ggtitle("Proportion of Total Programmed Streaming Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Around the holiday season, holiday music is listened to more than other genres.
## Anomaly: in June, on week 26, a random spike occurs.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Airplay.Audience / Airplay.Spins,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Airplay Interaction Proportion") +
  ggtitle("Airplay Interaction Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
There are higher counts in streaming than in digital or physical sales across the years. This makes sense as more people switch to streaming. iTunes has recently closed, meaning that users of Apple products are more likely to stream. The continued decrease in non-streaming sales suggests that people are probably reluctant to buy digitally on other platforms. # What do you expect holiday music in 2020 to look like? What do you expect it to look like in 2021? - which artists, albums and record labels do well # Are there any anomalies in the data? Anomaly: in June, on week 26, a random spike occurs.
In the data, some older artists have recent release years (for example, Bing Crosby albums dated 1999-2012), which throws off the visualizations.
# Around the holiday season, holiday music is listened to more than other genres.
## Anomaly: in June, on week 26, a random spike occurs.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Airplay.Audience / Airplay.Spins,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  xlab("Weeks") +
  ylab("Airplay Interaction Proportion") +
  ggtitle("Airplay Interaction Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# On Christmas Day, digital albums from other genres are purchased more;
# otherwise digital holiday albums are purchased more.
## Anomaly: why would other music be purchased more on Christmas Day? Maybe
## because people have more disposable income on that day.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Digital.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  ylab("Digital Album Sales Proportion") +
  ggtitle("Proportion of Digital Album Sales Between Holiday and Other Genres") +
  labs(fill = "Genre")
ggplotly(g)
# Album list highlights: per-year means of metric columns 7-12 (total album
# equivalent consumption audio, album sales, physical album sales, digital
# album sales, digital song sales, streaming), ignoring missing values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
album_highlight_2015 <- colMeans(album_list_2015[, c(7:12)], na.rm = TRUE)
album_highlight_2016 <- colMeans(album_list_2016[, c(7:12)], na.rm = TRUE)
album_highlight_2017 <- colMeans(album_list_2017[, c(7:12)], na.rm = TRUE)
album_highlight_2018 <- colMeans(album_list_2018[, c(7:12)], na.rm = TRUE)
# Change from the 2015 mean to the 2018 mean, relative to 2015.
album_highlight_change <- ((album_highlight_2018 - album_highlight_2015) / album_highlight_2015) * 100 # in percent
# daily holiday highlight on
# "Albums.w.TEA.w.SEA.On.Demand, "Albums.w.TEA.w.SEA.On.Demand.Audio" "Albums.w..TEA" ,"Total.Album.Sales", "Physical.Albums.Sales","Digital.Albums.Sales" "Digital.Song.Sales","Total.Streaming.On.Demand", "Streaming.On.Demand.Audio","Streaming.On.Demand.Video","Total.Streaming.Programmed","Streaming.Programmed.Audio" "Streaming.Programmed.Video" "Airplay.Spins" "Airplay.Audience"
# Per-year means of the 15 metric columns (positions 2-16), ignoring missing
# values. TRUE is spelled out because T can be reassigned.
daily_holiday_highlight_2016 <- colMeans(daily_holiday_2016[, c(2:16)], na.rm = TRUE)
daily_holiday_highlight_2017 <- colMeans(daily_holiday_2017[, c(2:16)], na.rm = TRUE)
daily_holiday_highlight_2018 <- colMeans(daily_holiday_2018[, c(2:16)], na.rm = TRUE)
daily_holiday_highlight_2019 <- colMeans(daily_holiday_2019[, c(2:16)], na.rm = TRUE)
# Percent change from the 2016 mean to the 2019 mean, relative to 2016.
daily_holiday_highlight_change <- ((daily_holiday_highlight_2019 - daily_holiday_highlight_2016) / daily_holiday_highlight_2016) * 100
# daily industry highlight: same columns and calculation for the industry data
daily_industry_highlight_2016 <- colMeans(daily_industry_2016[, c(2:16)], na.rm = TRUE)
daily_industry_highlight_2017 <- colMeans(daily_industry_2017[, c(2:16)], na.rm = TRUE)
daily_industry_highlight_2018 <- colMeans(daily_industry_2018[, c(2:16)], na.rm = TRUE)
daily_industry_highlight_2019 <- colMeans(daily_industry_2019[, c(2:16)], na.rm = TRUE)
daily_industry_highlight_change <- ((daily_industry_highlight_2019 - daily_industry_highlight_2016) / daily_industry_highlight_2016) * 100
# weekly holiday highlight: per-year means of metric columns 2-16, ignoring
# missing values. TRUE is spelled out because T can be reassigned.
weekly_holiday_highlight_2015 <- colMeans(weekly_holiday_2015[, c(2:16)], na.rm = TRUE)
weekly_holiday_highlight_2016 <- colMeans(weekly_holiday_2016[, c(2:16)], na.rm = TRUE)
weekly_holiday_highlight_2017 <- colMeans(weekly_holiday_2017[, c(2:16)], na.rm = TRUE)
weekly_holiday_highlight_2018 <- colMeans(weekly_holiday_2018[, c(2:16)], na.rm = TRUE)
weekly_holiday_highlight_2019 <- colMeans(weekly_holiday_2019[, c(2:16)], na.rm = TRUE)
# Percent change from the 2015 mean to the 2019 mean, relative to 2015.
weekly_holiday_highlight_change <- ((weekly_holiday_highlight_2019 - weekly_holiday_highlight_2015) / weekly_holiday_highlight_2015) * 100
# weekly industry highlight: same columns and calculation for the industry data
weekly_industry_highlight_2015 <- colMeans(weekly_industry_2015[, c(2:16)], na.rm = TRUE)
weekly_industry_highlight_2016 <- colMeans(weekly_industry_2016[, c(2:16)], na.rm = TRUE)
weekly_industry_highlight_2017 <- colMeans(weekly_industry_2017[, c(2:16)], na.rm = TRUE)
weekly_industry_highlight_2018 <- colMeans(weekly_industry_2018[, c(2:16)], na.rm = TRUE)
weekly_industry_highlight_2019 <- colMeans(weekly_industry_2019[, c(2:16)], na.rm = TRUE)
weekly_industry_highlight_change <- ((weekly_industry_highlight_2019 - weekly_industry_highlight_2015) / weekly_industry_highlight_2015) * 100
# Song list: percent change in mean year-to-date audio between 2016 and 2019.
# Unlike the other highlight means, mean() is called without na.rm, so a
# single missing YTD.Audio value makes the corresponding mean NA.
song_list_2016_audio <- mean(song_list_2016$YTD.Audio)
song_list_2019_audio <- mean(song_list_2019$YTD.Audio)
song_list_audio_change <-
  ((song_list_2019_audio - song_list_2016_audio) / song_list_2016_audio) * 100